The following links below contain publicly available data about A&E attendances across Scottish Hospitals published by Public Health Scotland. The data used was obtained as of the 26th May 2024
The monthly data on A&E attendances can be obtained from here: https://www.opendata.nhs.scot/dataset/monthly-accident-and-emergency-activity-and-waiting-times/resource/022c3b27-6a58-48dc-8038-8f1f93bb0e78
The weekly data on A&E waiting times can be obtained from here:
# Set the working directory
setwd("/Users/nng3/Masters/Semester 3/Project 1")
# Function to compute the MSE for demand for A&E
mse.load <- function(model, test_data){
pred <- cbind(test_data, predict.bam(model,test_data,se=TRUE,type="response"))
mse <- mean((pred$total_attendances-pred$fit)^2)
return(mse)
}
# Function to compute the RMSE for hourly a&e attendances
rmse.load.hr <- function(model, test_data){
pred <- predict.bam(model,test_data,se=TRUE,type="response")
rmse <- sqrt(mean((test_data$attendances-pred$fit)^2))
return(rmse)
}
# Function to compute the RMSE for waiting times
rmse.wait <- function(model, test_data){
pred <- cbind(test_data, predict.bam(model,test_data,se=TRUE,type="response"))
rmse <- sqrt(mean((test_data$prop.w4hrs-pred$fit)^2))
return(rmse)
}
# Function to compute the baseline RMSE for waiting times
wait.baseline.rmse <- function(train_data, test_data) {
# Compute the last year's (364 days ago) julian day for the test data
train_data_baseline <- train_data %>%
filter(julian %in% (unique(test_data$julian) - 364)) %>%
select(loc, julian, prop.w4hrs) %>%
rename(last_julian = julian)
# Compute the last julian in the test data to join on
test_data <- test_data %>%
mutate(last_julian = julian - 364)
# Merge baseline forecast with test data
baseline_forecast <- merge(test_data, train_data_baseline, by = c("last_julian", "loc"))
# Calculate RMSE of the baseline
rmse <- sqrt(mean((baseline_forecast$prop.w4hrs.x - baseline_forecast$prop.w4hrs.y)^2, na.rm = TRUE))
return(rmse)
}
# Function to calculate the baseline RMSE
load.baseline.rmse <- function(train_data, test_data) {
# Filter the baseline data based on cmonth - baseline_period
baseline_data <- train_data %>%
filter(cmonth %in% (unique(test_data$cmonth) - 12))
baseline_forecast <- merge(test_data, baseline_data, by = c("loc", "day.num", "hr.num","month"), suffixes = c(".test", ".baseline"))
# Calculate RMSE of the baseline
rmse <- sqrt(mean((baseline_forecast$attendances.test - baseline_forecast$attendances.baseline)^2, na.rm = TRUE))
return(rmse)
}
# Function to sum every 168 observations in forecast data
sum_168 <- function(x) {
tapply(x, (seq_along(x)-1) %/% 168, sum)
}
# Function to sum every 6th element starting from different initial points
sum_by_position <- function(vec) {
result <- numeric(6)
# Loop through the positions (1 to 6) and sum the corresponding elements
for (i in 1:6) {
indices <- seq(i, 180, by = 6)
result[i] <- sum(vec[indices])
}
return(result)
}
# Read data and drop the country variable (only one unique obs: "S92000003") and department type for the waiting times dataset
load <- read.csv("~/Masters/Semester 3/Project 1/weekly_load.csv")[,-2]
wait <- read.csv("~/Masters/Semester 3/Project 1/weekly_wait.csv")[,-2]
# Changing col names
names(load) <- c("monthyr", "HBT", "loc", "dpttype", "day", "week", "hr", "inout", "attendances")
names(wait) <- c("wed", "HBT", "loc", "dpttype","attendances", "w4hrs", "gr4hrs", "per.w4hrs", "gr8hrs", "per.gr8hrs", "gr12hrs", "per.gr12hrs")
# Changing observation names for readability
load$dpttype[load$dpttype == "Emergency Department"] <- "AE"
load$dpttype[load$dpttype == "Minor Injury Unit or Other"] <- "MIU"
wait$dpttype[wait$dpttype == "Emergency Department"] <- "AE"
wait$dpttype[wait$dpttype == "Minor Injury Unit or Other"] <- "MIU"
# Filter to focus only the AE department not MIU
load <- load[load$dpttype=="AE",]
wait <- wait[wait$dpttype=="AE",]
# Convert the date variable
wait$date <- as.Date(as.character(wait$wed), format("%Y%m%d"))
# Adjust the date index for load
# Change the hours 00:00-00:59 for example to start at 0 to 23
load$hr.num <- as.numeric(sub(":.*", "", load$hr))
# Change the day to 1,2,...,7 days of the week
day_to_num <- c("Friday"=5, "Monday"=1, "Saturday"=6, "Sunday"=7, "Thursday"=4, "Tuesday"=2, "Wednesday"=3)
load$day.num <- day_to_num[load$day]
# Segregate month and year into separate columns
load$year <- as.numeric(substr(load$monthyr, 1, 4))
load$month <- as.numeric(substr(load$monthyr, 5, 6))
wait$year <- as.numeric(substr(wait$date, 1, 4))
wait$month <- as.numeric(substr(wait$date, 6, 7))
wait$dom <- as.numeric(substr(wait$date, 9, 10))
# Specify the number of days from the start day
wait$julian <- julian(wait$date, origin=as.Date("2015-02-22"))
# Create factor variables
load$HBT <- as.factor(load$HBT)
wait$HBT <- as.factor(wait$HBT)
wait$loc <- as.factor(wait$loc)
load$inout <- as.factor(load$inout)
load$loc <- as.factor(load$loc)
# Create proportions
wait$per.gr4hrs <- round((wait$gr4hrs/wait$attendances)*100,1)
wait$prop.w4hrs <- wait$w4hrs/wait$attendances
wait$prop.gr4hrs <- wait$gr4hrs/wait$attendances
wait$bet4to8hrs <- wait$gr4hrs - wait$gr8hrs
wait$prop.bet4to8hrs <- wait$bet4to8hrs/wait$attendances
wait$bet8to12hrs <- wait$gr8hrs - wait$gr12hrs
wait$prop.bet8to12hrs <- wait$bet8to12hrs/wait$attendances
wait$prop.gr12hrs <- wait$gr12hrs/wait$attendances
We noticed that observations with no A&E attendances were dropped, so we must impute them back into the dataset.
# Order the data by day
dup <- load[order(load$loc, load$year, load$month, load$day.num), ]
# List wise segregation
split_dup <- split(dup, list(dup$loc, dup$year, dup$month, dup$day.num), drop = TRUE)
hr_vect <- seq(0,23)
# Return missing data
check <- lapply(split_dup, function(df) {
if(length(df$hr.num) != 24) {
return(df)
}
}
)
# Drop the null
check <- check[!sapply(check, is.null)]
# Create new rows in the dataframe containing 0 attendances
dup <- lapply(check, function(df) {
loc <- which(!(hr_vect %in% df$hr.num))
new_row <- df[rep(1,length(loc)),]
new_row$hr.num <- c(loc-1)
return(new_row)
})
# Combine the list back into a dataframe
sam <- do.call(rbind, dup)
sam$attendances <- 0
load <- rbind(load, sam)
# Cumulative months and years
load$cmonth <- (load$year-2018)*12 + load$month
load$chr <- (load$day.num-1)*24 + load$hr.num
rm(split_dup)
rm(check)
rm(dup)
# Check which hospitals have more or less than 482 observations
locw <- unique(wait$loc)
num <- rep(0, length(locw))
for(i in 1:length(locw)) {
num[i] <- nrow(wait[wait$loc == locw[i],])
}
locw[which(num != 482)]
## [1] G405H G306H G516H
## 32 Levels: A111H A210H B120H C121H C313H C418H F704H G107H G306H ... Z102H
# Chose to drop "G306H" "G516H" and inspect "G405H"
id <- which(wait$loc == "G405H" & wait$date == "2015-05-03")
# Sum together attendances and remove duplicates
wait$attendances[id[2]] <- wait$attendances[333] + wait$attendances[336]
wait$w4hrs[id[2]] <- wait$w4hrs[id[1]] + wait$w4hrs[id[2]]
wait$gr4hrs[id[2]] <- wait$gr4hrs[id[1]] + wait$gr4hrs[id[2]]
wait$gr8hrs[id[2]] <- wait$gr8hrs[id[1]] + wait$gr8hrs[id[2]]
wait$gr12hrs[id[2]] <- wait$gr12hrs[id[1]] + wait$gr12hrs[id[2]]
wait$per.w4hrs[id[2]] <- round(wait$w4hrs[id[2]]/wait$attendances[id[2]]*100, 1)
wait$per.gr8hrs[id[2]] <- round(wait$gr8hrs[id[2]]/wait$attendances[id[2]]*100, 1)
wait$per.gr12hrs[id[2]] <- round(wait$gr12hrs[id[2]]/wait$attendances[id[2]]*100, 1)
wait <- wait[-id[1],]
# Remove "G306H" "G516H" from the waiting times data
wait <- subset(wait, loc %in% unique(wait$loc)[-c(16,17)])
The dates in which we put the indicator or covid-19 were chosen based on the dates published here: https://www.covid19inquiry.scot/covid-19-pandemic-scotland-timeline-key-dates however the end date seems to be arbitrary choice which was chosen based on the shift in attention towards other emergencies e.g., war in Ukraine affecting gas prices.
# Create an indicator variable for COVID-19 years
covid <- seq.Date(from=as.Date("2020-03-01"),to=as.Date("2022-03-01"), by="month")
covid <- gsub("-","",covid)
covid <- substr(covid, 1, nchar(covid)-2)
load$covid <- ifelse(load$monthyr %in% covid, 1, 0)
covid_start <- as.Date("2020-03-26")
covid_end <- as.Date("2022-03-26")
wait$covid <- ifelse(wait$date >= covid_start & wait$date <= covid_end, 1, 0)
# Grouping load by month of the year to plot A&E demand by location
load.month <- load %>%
group_by(loc, year, month, day.num, cmonth, covid) %>%
summarize(total_attendances = sum(attendances, na.rm = TRUE))
## `summarise()` has grouped output by 'loc', 'year', 'month', 'day.num',
## 'cmonth'. You can override using the `.groups` argument.
load.month.plot <- load %>%
group_by(loc, year, month, cmonth, covid) %>%
summarize(total_attendances = sum(attendances, na.rm = TRUE))
## `summarise()` has grouped output by 'loc', 'year', 'month', 'cmonth'. You can
## override using the `.groups` argument.
load.month.plot <- load.month.plot %>%
mutate(yearmon = as.yearmon(paste(year, month, sep = "-")))
last_dates <- load.month.plot %>%
group_by(loc) %>%
filter(yearmon == max(yearmon)) %>%
ungroup()
# Create the plot for each A&E demand for each treatment location
p <- ggplot(load.month.plot, aes(x = yearmon, y = total_attendances, color = loc)) +
geom_line(linewidth = 0.3) +
labs(title = "Total A&E Attendances for all Treatment Locations in Scotland",
x = "Date (Year-Month)",
y = "Total Attendances") +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10,margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
axis.title.x = element_text(margin = margin(t = 15))
) +
geom_text_repel(data = last_dates, aes(label = loc),
nudge_x = 0.4, size = 4,
show.legend = FALSE, direction = "y",
segment.size = 0.2, segment.color = "grey50") +
geom_vline(xintercept = as.yearmon("2020-03"), linetype = "dashed", color = "red", size = 0.5) + # Lockdown line
annotate("text", x = as.yearmon("2020-03"), y = max(load.month.plot$total_attendances),
label = "First Nationwide COVID-19 lockdown", color = "red", vjust = -1, hjust = -0.1, size = 3) + # Annotation for lockdown
scale_x_yearmon(format = "%Y-%m", n = 10) +
expand_limits(x = c(min(load.month.plot$yearmon), max(load.month.plot$yearmon)))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p)
filename <- "a&eallloc.png"
ggsave(filename, plot = p, width = 10, height = 6, units = "in", dpi = 300)
# Plot weekly cycle for each month for the first 6 years in each location
par(mfrow = c(2,3))
loc <- unique(load$loc[load$dpttype != "MIU"])
for (j in loc) {
data <- load[load$loc == j,]
for (k in 2018:2023) {
yl <- range(data$attendances)
with(data[data$year==k&data$month==1,], plot(chr[order(chr)], attendances[order(chr)], type="l", ylim = yl, xlab = "Hour of the week", ylab = unique(data$loc), main = k))
for (m in 2:12) with(data[data$year==k&data$month==m,], lines(chr[order(chr)], attendances[order(chr)], col=m))
}
readline()
}
# Cumulative hour for locations
load.cm.plot <- load %>%
filter(loc %in% c("S314H", "T101H", "G405H"), year %in% c(2018, 2020, 2023), month %in% c(1, 4, 7, 10))
# Create the ggplot for the filtered data
e <- ggplot(load.cm.plot, aes(x = chr, y = attendances, color = factor(month))) +
geom_line(size = 0.6) +
facet_grid(scales = "free_y",rows = vars(year), cols = vars(loc)) +
labs(title = "Cumulative Hourly Attendances for Selected Locations and Years",
x = "Cumulative Hour",
y = "Attendances",
color = "Month") +
theme_minimal() +
theme(
axis.title.x = element_text(margin = margin(t = 15)),
legend.position = "bottom",
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
strip.background = element_blank(),
panel.spacing = unit(1, "lines"),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
)
# Print the combined plot
print(e)
filename <- "chr_someloc.png"
ggsave(filename, plot = e, width = 10, height = 6, units = "in", dpi = 300)
# Heatmap of month against cumulative hour
load_agg_data <- load %>%
group_by(month, chr) %>%
filter(year != 2024) %>%
summarise(total_attendances = sum(attendances))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# Reshape the data into a wide format for the heatmap
heatmap_data <- dcast(load_agg_data, chr ~ month, value.var = "total_attendances")
# Plotting the heatmap
pp <- ggplot(melt(heatmap_data, id.vars = "chr"), aes(x = variable, y = chr, fill = value)) +
geom_tile(color = "white") +
scale_fill_viridis_c(option = "plasma", na.value = "white") +
labs(x = "Month", y = "Cumulative Hour", fill = "Attendances") +
ggtitle("Heatmap of Total A&E Attendances by Cumulative Hour and Month") +
theme_minimal() +
theme(
axis.title.x = element_text(margin = margin(t = 15)),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.title = element_text(size = 10),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
strip.background = element_blank(),
panel.spacing = unit(1, "lines"),
)
print(pp)
filename <- "heat.png"
ggsave(filename, plot = pp, width = 10, height = 6, units = "in", dpi = 300)
# Plot percentage of attendances within 4hours
fewew <- ggplot(wait, aes(x = date, y=per.w4hrs)) +
geom_line() +
facet_wrap(~ loc, ncol=5) +
labs(title = "Percentage of Wait Times by within 4 hours from the week of 2015-02-22 to 2024-05-12", y = "Percentage of Attendances within 4 hours",
x="Week end date") +
theme_minimal() +
theme(
axis.title.x = element_text(margin = margin(t = 15)),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
legend.title = element_text(size = 10),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
strip.background = element_blank(),
panel.spacing = unit(1, "lines"),
)
print(fewew)
ggsave("allwaitloc.png", plot = fewew, width = 10, height = 6, units = "in", dpi = 300)
# Define the specific locations to expand the plot
some_loc <- c("S314H", "C418H", "T101H", "C121H")
wait_some_loc <- filtered_wait <- wait %>%
filter(loc %in% some_loc)
q <- ggplot(filtered_wait, aes(x = date, y = per.w4hrs, color = loc)) +
geom_line(size = 0.3) +
labs(
title = "Percentage of Attendances within 4 Hours for Selected Locations",
x = "Date",
y = "Percentage of Attendances within 4 Hours",
color = "ISD Hospital Location Codes"
) +
geom_vline( # Vertical line indicating when covid-19 began
xintercept = as.Date("2020-03-26"),
linetype = "dashed",
color = "red",
size = 0.5
) +
annotate(
"text",
x = as.Date("2020-03-01"),
y = max(filtered_wait$per.w4hrs, na.rm = TRUE),
label = "First Nationwide COVID-19 lockdown",
color = "red",
vjust = -1,
hjust = -0.1,
size = 3
) +
theme_minimal() +
theme(
legend.position = "bottom",
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
axis.title.x = element_text(margin = margin(t = 15))
)
print(q)
filename <- "wait_someloc.png"
ggsave(filename, plot = q, width = 10, height = 6, units = "in", dpi = 300)
# Histogram plot to show the distribution of the data
# Distribution of waiting times
d <- ggplot() +
geom_histogram(data = wait, aes(x = gr4hrs, fill = "Greater than 4 hours"), bins = 50, color = "black", alpha = 0.5) +
geom_histogram(data = wait, aes(x = w4hrs, fill = "Within 4 hours"), bins = 50, color = "black", alpha = 0.5) +
labs(title = "Histogram of weekly A&E wait times",
x = "Value",
y = "Frequency") +
scale_fill_manual(values = c("skyblue", "orange"),
name = "Category",
labels = c("Greater than 4 hours", "Within 4 hours")) +
coord_cartesian(xlim = c(0, 2000)) +
theme_minimal() +
theme(
axis.title.x = element_text(margin = margin(t = 15)),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
panel.spacing = unit(1, "lines"),
legend.position = "bottom",
legend.box = "horizontal",
legend.key.size = unit(0.4, "cm")
)
# Distribution of A&E attendances
d1 <- ggplot(load, aes(x = attendances)) +
geom_histogram(bins = 50, alpha = 0.5, color = "black") +
labs(title = "Histogram of A&E attendances",
x = "Value",
y = "Frequency") +
theme_minimal() +
theme(
axis.title.x = element_text(margin = margin(t = 15)),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
panel.spacing = unit(1, "lines"),
)
d2 <- grid.arrange(d, d1, ncol = 2)
print(d2)
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
# filename <- "hist.png"
# ggsave(filename, plot = d2, width = 10, height = 6, units = "in", dpi = 300)
# Create a rolling window using 6 months for testing and the rest for training
train.load.1 <- load %>% filter(cmonth <= 67)
train.load.2 <- load %>% filter(cmonth >= 2, cmonth <= 68)
train.load.3 <- load %>% filter(cmonth >= 3, cmonth <= 69)
test.load.1 <- load %>% filter(cmonth >= 68, cmonth <= 73)
test.load.2 <- load %>% filter(cmonth >= 69, cmonth <= 74)
test.load.3 <- load %>% filter(cmonth >= 70)
# Train each model specification on each rolling window and examine performance on test dataset
list.train.load <- list(train.load.1, train.load.2, train.load.3)
list.test.load <- list(test.load.1, test.load.2, test.load.3)
knots <- list(day.num=c(1,8), hr.num=c(0,24), chr=c(0,168), month=c(0,12))
# Initialize lists
hr.rmse.results <- list(m1 = list(), m2 = list(), m3 = list())
m1.hr.fit <- list(); m2.hr.fit <- list(); m3.hr.fit <- list()
# Fitting model 1
for (i in 1:3) {
m1.hr.fit[[i]] <- bam(attendances ~ loc + covid + s(day.num, k=7, bs="cc") + s(chr, k=80, bs="ad") + s(hr.num, k=20, bs="cc") + te(cmonth, month, k=c(40,10), bs=c("cr","cc")), family = tw, data = list.train.load[[i]], method= "fREML",knots = knots, discrete = TRUE)
hr.rmse.results$m1[[i]] <- list(rmse = rmse.load.hr(m1.hr.fit[[i]], list.test.load[[i]]), data_trained_on = paste("train.load.", i, sep = ""))
}
# Fiting model 2
for (i in 1:3) {
m2.hr.fit[[i]] <- bam(attendances ~ loc + covid + as.factor(inout) + s(year, k=6, bs="cr") + s(chr, k=100, bs="ad") + s(hr.num, k=20, bs="cc") + te(cmonth, month, k=c(40,10), bs=c("cr","cc")),
family = tw, data = list.train.load[[i]], knots = knots, discrete = TRUE)
hr.rmse.results$m2[[i]] <- list(rmse = rmse.load.hr(m2.hr.fit[[i]], list.test.load[[i]]), data_trained_on = paste("train.load.", i, sep = ""))
}
# Fitting model 3
for (i in 1:3) {
m3.hr.fit[[i]] <- bam(attendances ~ s(loc, k=30, bs="re") + covid + inout + te(chr, hr.num, k=c(100,20), bs=c("cc","cc")) + te(cmonth, month, k=c(40,10), bs=c("cr","cc")),family = tw, data = list.train.load[[i]], knots = knots, discrete = TRUE)
hr.rmse.results$m3[[i]] <- list(rmse = rmse.load.hr(m3.hr.fit[[i]], list.test.load[[i]]), data_trained_on = paste("train.load.", i, sep = ""))
}
# Print RMSE results for each model
print(hr.rmse.results)
## $m1
## $m1[[1]]
## $m1[[1]]$rmse
## [1] 7.363933
##
## $m1[[1]]$data_trained_on
## [1] "train.load.1"
##
##
## $m1[[2]]
## $m1[[2]]$rmse
## [1] 8.183022
##
## $m1[[2]]$data_trained_on
## [1] "train.load.2"
##
##
## $m1[[3]]
## $m1[[3]]$rmse
## [1] 7.41899
##
## $m1[[3]]$data_trained_on
## [1] "train.load.3"
##
##
##
## $m2
## $m2[[1]]
## $m2[[1]]$rmse
## [1] 7.420417
##
## $m2[[1]]$data_trained_on
## [1] "train.load.1"
##
##
## $m2[[2]]
## $m2[[2]]$rmse
## [1] 7.713513
##
## $m2[[2]]$data_trained_on
## [1] "train.load.2"
##
##
## $m2[[3]]
## $m2[[3]]$rmse
## [1] 7.305569
##
## $m2[[3]]$data_trained_on
## [1] "train.load.3"
##
##
##
## $m3
## $m3[[1]]
## $m3[[1]]$rmse
## [1] 7.360321
##
## $m3[[1]]$data_trained_on
## [1] "train.load.1"
##
##
## $m3[[2]]
## $m3[[2]]$rmse
## [1] 8.184336
##
## $m3[[2]]$data_trained_on
## [1] "train.load.2"
##
##
## $m3[[3]]
## $m3[[3]]$rmse
## [1] 7.414629
##
## $m3[[3]]$data_trained_on
## [1] "train.load.3"
# Calculate and print mean RMSE for each model specification
load.mean.rmse <- list(
m1 = mean(sapply(hr.rmse.results$m1, function(x) x$rmse)),
m2 = mean(sapply(hr.rmse.results$m2, function(x) x$rmse)),
m3 = mean(sapply(hr.rmse.results$m3, function(x) x$rmse))
)
print(load.mean.rmse)
## $m1
## [1] 7.655315
##
## $m2
## [1] 7.479833
##
## $m3
## [1] 7.653095
# Compute the baseline forecast
baseline.rmse.load.1 <- load.baseline.rmse(train.load.1, test.load.1)
baseline.rmse.load.2 <- load.baseline.rmse(train.load.2, test.load.2)
baseline.rmse.load.3 <- load.baseline.rmse(train.load.3, test.load.3)
baseline.rmse.load.1; baseline.rmse.load.2; baseline.rmse.load.3
## [1] 7.849163
## [1] 7.788533
## [1] 7.910597
# Average the baselines
(baseline.rmse.load.1 + baseline.rmse.load.2 + baseline.rmse.load.3)/3
## [1] 7.849431
# Compare AIC of the two models with RMSE lower than the baseline
load.aic <- list(
m2 = sapply(m2.hr.fit, function(x) AIC(x)),
m3 = sapply(m3.hr.fit, function(x) AIC(x))
)
print(load.aic)
## $m2
## [1] 1861597 1973448 1975227
##
## $m3
## [1] 1971333 1972571 1974363
appraise(m1.hr.fit[[1]])
appraise(m1.hr.fit[[2]])
appraise(m1.hr.fit[[3]])
appraise(m2.hr.fit[[1]])
appraise(m2.hr.fit[[2]])
appraise(m2.hr.fit[[3]])
appraise(m3.hr.fit[[1]])
appraise(m3.hr.fit[[2]])
appraise(m3.hr.fit[[3]])
# Create new prediction dataframe
load.hr.forecast <- load[load$month %in% c(4:9) & load$year == 2023,]
oldcmon <- sort(unique(load.hr.forecast$cmonth))
newcmon <- 76:81
for(i in 1:6) {
load.hr.forecast$cmonth[load.hr.forecast$cmonth == oldcmon[i]] <- newcmon[i]
}
load.hr.forecast$year <- 2024; load.hr.forecast <- load.hr.forecast[,-9]
# Retrain model with all data
m2.hr.full <- bam(attendances ~ loc + covid + inout + s(year, k=6, bs="cr") + s(chr, k=100, bs="ad") + s(hr.num, k=20, bs="cc") + te(cmonth, month, k=c(40,10), bs=c("cr","cc")),
family = tw, data = load, knots = knots, discrete = TRUE)
load.hr.forecast <- cbind(load.hr.forecast, predict.bam(m2.hr.full,load.hr.forecast,type="response"))
names(load.hr.forecast)[16] <- "fit"
draw(m2.hr.full, rug = NULL)
summary(m2.hr.full); appraise(m2.hr.full)
##
## Family: Tweedie(p=1.105)
## Link function: log
##
## Formula:
## attendances ~ loc + covid + inout + s(year, k = 6, bs = "cr") +
## s(chr, k = 100, bs = "ad") + s(hr.num, k = 20, bs = "cc") +
## te(cmonth, month, k = c(40, 10), bs = c("cr", "cc"))
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.260430 0.033943 96.057 < 2e-16 ***
## locA210H -0.654967 0.003577 -183.087 < 2e-16 ***
## locB120H -0.758566 0.003687 -205.722 < 2e-16 ***
## locC121H -2.071722 0.005814 -356.353 < 2e-16 ***
## locC313H -0.768445 0.003698 -207.790 < 2e-16 ***
## locC418H -0.069657 0.003073 -22.666 < 2e-16 ***
## locF704H 0.013497 0.003016 4.475 7.63e-06 ***
## locG107H 0.289653 0.002847 101.741 < 2e-16 ***
## locG405H 0.394168 0.002791 141.225 < 2e-16 ***
## locG513H 0.025097 0.003008 8.343 < 2e-16 ***
## locH103H -1.993416 0.005641 -353.381 < 2e-16 ***
## locH202H -0.595155 0.003517 -169.225 < 2e-16 ***
## locH212H -1.965699 0.005581 -352.182 < 2e-16 ***
## locL106H 0.033876 0.003002 11.284 < 2e-16 ***
## locL302H -0.031269 0.003046 -10.264 < 2e-16 ***
## locL308H 0.108727 0.002954 36.807 < 2e-16 ***
## locN101H -0.149163 0.003131 -47.640 < 2e-16 ***
## locN121H -1.413557 0.004560 -310.024 < 2e-16 ***
## locN411H -0.904659 0.003854 -234.705 < 2e-16 ***
## locR103H -2.295874 0.006348 -361.695 < 2e-16 ***
## locS308H -0.136461 0.003122 -43.715 < 2e-16 ***
## locS314H 0.619116 0.002684 230.672 < 2e-16 ***
## locS319H -0.237528 0.003199 -74.249 < 2e-16 ***
## locT101H -0.187135 0.003160 -59.223 < 2e-16 ***
## locT202H -1.043429 0.004027 -259.110 < 2e-16 ***
## locV217H -0.068233 0.003072 -22.210 < 2e-16 ***
## locW107H -2.425733 0.006685 -362.844 < 2e-16 ***
## locY144H -1.628212 0.004922 -330.819 < 2e-16 ***
## locY146H -0.692757 0.003617 -191.545 < 2e-16 ***
## locZ102H -2.177201 0.006057 -359.428 < 2e-16 ***
## covid 0.121637 0.055701 2.184 0.029 *
## inoutOut of Hours 0.010586 0.005946 1.780 0.075 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year) 1.011 1.011 0.894 0.342
## s(chr) 62.024 68.775 335.443 <2e-16 ***
## s(hr.num) 17.955 18.000 1827.486 <2e-16 ***
## te(month,cmonth) 71.377 71.973 603.829 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.89 Deviance explained = 89.8%
## fREML = 5.9391e+05 Scale est. = 1.2431 n = 378000
# Obtain confidence intervals by simulating from the posterior
# Reorder the data
load.hr.forecast <- load.hr.forecast[order(load.hr.forecast$loc, load.hr.forecast$cmonth, load.hr.forecast$chr),]
# Obtain lp matrix and simulate from the posterior
X_P <- predict(m2.hr.full, load.hr.forecast, type="lpmatrix")
n <- 5000
br <- rmvn(n, coef(m2.hr.full), vcov(m2.hr.full))
# Compute the linear predictor
tmp <- list()
for(i in 1:n) {
l_p <- X_P %*% br[i,]
tmp[[i]] <- exp(l_p)
}
# Sum the observations by hospital location and cmonth
sums_list <- lapply(tmp, sum_168)
identify <- load.hr.forecast %>%
group_by(loc, cmonth) %>%
summarise()
## `summarise()` has grouped output by 'loc'. You can override using the `.groups`
## argument.
forecast.CI.mat <- do.call(cbind,sums_list)
forecast.CI.df <- as.data.frame(forecast.CI.mat)
forecast.CI.df <- cbind(identify, forecast.CI.df)
# Compute credible interval
quantiles <- apply(forecast.CI.df[,3:5002],1,function(x) quantile(x,c(0.025,0.975)))
identify$lower <- quantiles[1,]
identify$upper <- quantiles[2,]
# Sum forecasts and merge to get the CI
load.mon.forecast <- load.hr.forecast %>%
group_by(loc, cmonth, year, month) %>%
summarise(total_attendances = sum(fit)) %>%
mutate(yearmon = as.yearmon(paste(year, month, sep="-")))
## `summarise()` has grouped output by 'loc', 'cmonth', 'year'. You can override
## using the `.groups` argument.
load.mon.forecast <- merge(load.mon.forecast, identify, by=c("loc", "cmonth"))
# Identifer dataframe
iden.scotland <- load.hr.forecast %>%
group_by(cmonth) %>%
summarise()
# Obtain the credible interval for the whole of scotland
scotland_load <- lapply(sums_list, sum_by_position)
forecast.CI.scotland <- do.call(cbind,scotland_load)
forecast.CI.scotland.df <- as.data.frame(forecast.CI.scotland)
forecast.CI.scotland.df <- cbind(iden.scotland, forecast.CI.scotland.df)
# Compute credible interval
quantiles <- apply(forecast.CI.scotland.df[,2:5001],1,function(x) quantile(x,c(0.025,0.975)))
iden.scotland$lower <- quantiles[1,]
iden.scotland$upper <- quantiles[2,]
# Sum forecasts and merge to get the CI
load.scotland.forecast <- load.hr.forecast %>%
group_by(cmonth, year, month) %>%
summarise(total_attendances = sum(fit)) %>%
mutate(yearmon = as.yearmon(paste(year, month, sep="-")))
## `summarise()` has grouped output by 'cmonth', 'year'. You can override using
## the `.groups` argument.
load.scotland.forecast <- merge(load.scotland.forecast, iden.scotland, by=c("cmonth"))
# Transform all historical data into monthly totals
load.month.plot <- load %>%
group_by(loc, year, month, cmonth, covid) %>%
summarize(total_attendances = sum(attendances, na.rm = TRUE)) %>%
mutate(yearmon = as.yearmon(paste(year,month,sep="-")))
## `summarise()` has grouped output by 'loc', 'year', 'month', 'cmonth'. You can
## override using the `.groups` argument.
# Combine historical data with forecasts along with the CI
load.month.hr.full <- bind_rows(
load.month.plot %>% mutate(source = "Actual"),
load.mon.forecast %>% select(loc, yearmon, total_attendances, lower, upper) %>%
mutate(source = "Forecast")
)
# Filter the data to use historical data from 2023
load.month.hr.full.2023 <- load.month.hr.full %>%
filter(yearmon >= as.yearmon("2023-01"))
# Create the plot
qq <- ggplot(load.month.hr.full.2023, aes(x = yearmon, y = total_attendances, color = source)) +
geom_line(data = load.month.hr.full.2023 %>% filter(source == "Actual"), size = 0.3) +
geom_line(data = load.month.hr.full.2023 %>% filter(source == "Forecast"), size = 0.3, linetype = "dashed") +
geom_ribbon(data = load.month.hr.full.2023 , aes(x = yearmon, ymin = lower, ymax = upper, fill = loc), alpha = 0.2, inherit.aes = FALSE) +
labs(
title = "Total A&E Attendances and Forecasts by Location",
x = "Date (Year-Month)",
y = "Total Attendances"
) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
axis.title.x = element_text(margin = margin(t = 15)),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
) +
scale_x_yearmon(format = "%Y-%m", n = 10, breaks = seq(min(load.month.hr.full.2023$yearmon), max(load.month.hr.full.2023$yearmon), by = 6/12)) +
facet_wrap(~ loc, scales = "free_y", ncol = 6)
print(qq)
# Save the plot
ggsave("demand_forecast_all.png", plot = qq, width = 12, height = 8, dpi = 300)
# Select some locations to plot
load.plot.some.2023 <- load.month.hr.full.2023 %>% filter(loc %in% c("S314H", "C418H", "T101H", "C121H"))
load.plot.some.forecast <- load.plot.some.2023 %>% filter(source == "Forecast")
load.plot.some.actual <- load.plot.some.2023 %>% filter(source == "Actual")
qqq <- ggplot(load.plot.some.2023 , aes(x = yearmon, y = total_attendances, color = loc)) +
geom_line(data = load.plot.some.actual, aes(color = loc), linewidth = 0.3) + # Actual data lines
geom_line(data = load.plot.some.forecast, aes(color = loc), linewidth = 0.3, linetype = "dashed") + # Forecast data lines
geom_ribbon(data = load.plot.some.2023 , aes(x = yearmon, ymin = lower, ymax = upper, fill = loc),
alpha = 0.2, inherit.aes = FALSE) + # Credible intervals
labs(title = "Monthly Forecast for A&E Attendances in Selected Treatment Locations in Scotland",
x = "Date (Year-Month)",
y = "Total Attendances") +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
axis.title.x = element_text(margin = margin(t = 15))
) +
scale_x_yearmon(format = "%Y-%m", n = 10, breaks = seq(min(load.plot.some.2023$yearmon), max(load.plot.some.2023$yearmon), by = 4/12)) +
expand_limits(x = c(min(load.plot.some.2023$yearmon), max(load.plot.some.2023$yearmon))) +
facet_wrap(~ loc, scales = "free_y") # Create separate panels for each location
# Print the plot
print(qqq)
ggsave("load.someforecast.png", plot = qqq, width = 10, height = 6, units = "in", dpi = 300)
# Forecast for whole of scotland
historical_scotland <- load %>%
group_by(year, month) %>%
summarize(total_attendances = sum(attendances, na.rm = TRUE)) %>%
mutate(yearmon = as.yearmon(paste(year, month, sep = "-")))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Combine historical data with forecasts along with the CI
load.scotland.full <- bind_rows(
historical_scotland %>% mutate(source = "Actual"),
load.scotland.forecast %>% select(yearmon, total_attendances, lower, upper) %>%
mutate(source = "Forecast")
)
# Filter the data to use historical data from 2023
load.scotland.full.2023 <- load.scotland.full %>%
filter(yearmon >= as.yearmon("2023-01"))
scotland_plot <- ggplot(load.scotland.full.2023, aes(x = yearmon, y = total_attendances, color = source)) +
geom_line(data = load.scotland.full.2023 %>% filter(source == "Actual"), size = 0.3) +
geom_line(data = load.scotland.full.2023 %>% filter(source == "Forecast"), size = 0.3, linetype = "dashed") +
geom_ribbon(data = load.scotland.full.2023 %>% filter(source == "Forecast"),
aes(x = yearmon, ymin = lower, ymax = upper), alpha = 0.2, inherit.aes = FALSE) +
labs(
title = "Total A&E Attendances and Forecasts for Scotland",
x = "Date (Year-Month)",
y = "Total Attendances"
) +
theme_minimal() +
theme(
legend.position = "bottom",
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
axis.title.x = element_text(margin = margin(t = 15)),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
) +
scale_x_yearmon(format = "%Y-%m", n = 6, breaks = seq(min(load.scotland.full.2023$yearmon), max(load.scotland.full.2023$yearmon), by = 1/12))
print(scotland_plot)
ggsave("scotlandforecast.png", plot = scotland_plot, width = 10, height = 6, units = "in", dpi = 300)
# Create a rolling window and 6 month test
train.wait.1 <- wait %>% filter(julian <= 3087)
train.wait.2 <- wait %>% filter(julian >= 56, julian <= 3143)
train.wait.3 <- wait %>% filter(julian >= 112, julian <= 3199)
test.wait.1 <- wait %>% filter(julian >= 3094, julian <= 3255)
test.wait.2 <- wait %>% filter(julian >= 3150, julian <= 3311)
test.wait.3 <- wait %>% filter(julian >= 3206)
# Train each model specification on each rolling window and examine performance on test dataset
list.train.wait <- list(train.wait.1, train.wait.2, train.wait.3)
list.test.wait <- list(test.wait.1, test.wait.2, test.wait.3)
knots.wait <- list(dom=c(0,31), month=c(0,12))
# Initialize lists for model fits and results
m1.wait.fit <- list(); m2.wait.fit <- list(); m3.wait.fit <- list()
wait.rmse.results <- list(m1 = list(), m2 = list(), m3 = list())
# Model 1 training
for (i in 1:3) {
m1.wait.fit[[i]] <- bam(cbind(w4hrs, gr4hrs) ~ covid + HBT + s(loc, k=30, bs="re") + s(julian, k=80, bs="ad") + te(dom, month, k=c(30,12), bs=c("cc","cc")),family=quasibinomial, data=list.train.wait[[i]], knots=knots.wait)
wait.rmse.results$m1[[i]] <- list(rmse = rmse.wait(m1.wait.fit[[i]], list.test.wait[[i]]),data_trained_on = paste("train.wait.", i, sep=""))
}
# Model 2 training
for(i in 1:3) {
m2.wait.fit[[i]] <- bam(cbind(w4hrs, gr4hrs) ~ covid + s(year, k=8, bs="cr") + s(loc, k=30, bs="re") + s(julian, k=80, bs="ad") + te(dom, month, k=c(30,12), bs=c("cc","cc")),family=quasibinomial, data=list.train.wait[[i]], knots=knots.wait)
wait.rmse.results$m2[[i]] <- list(rmse = rmse.wait(m2.wait.fit[[i]], list.test.wait[[i]]),data_trained_on = paste("train.wait.", i, sep=""))
}
# Model 3 training
for(i in 1:3) {
m3.wait.fit[[i]] <- bam(cbind(w4hrs, gr4hrs) ~ covid + loc + s(year, k=8, bs="cr") + s(julian, k=80, bs="ad") + te(dom, month, k=c(30,12), bs=c("cc","cc")),family=quasibinomial, data=list.train.wait[[i]], knots=knots.wait)
wait.rmse.results$m3[[i]] <- list(rmse = rmse.wait(m3.wait.fit[[i]], list.test.wait[[i]]), data_trained_on = paste("train.wait.", i, sep=""))
}
print(wait.rmse.results)
## $m1
## $m1[[1]]
## $m1[[1]]$rmse
## [1] 0.0911247
##
## $m1[[1]]$data_trained_on
## [1] "train.wait.1"
##
##
## $m1[[2]]
## $m1[[2]]$rmse
## [1] 0.06265235
##
## $m1[[2]]$data_trained_on
## [1] "train.wait.2"
##
##
## $m1[[3]]
## $m1[[3]]$rmse
## [1] 0.08818964
##
## $m1[[3]]$data_trained_on
## [1] "train.wait.3"
##
##
##
## $m2
## $m2[[1]]
## $m2[[1]]$rmse
## [1] 0.08645065
##
## $m2[[1]]$data_trained_on
## [1] "train.wait.1"
##
##
## $m2[[2]]
## $m2[[2]]$rmse
## [1] 0.08607701
##
## $m2[[2]]$data_trained_on
## [1] "train.wait.2"
##
##
## $m2[[3]]
## $m2[[3]]$rmse
## [1] 0.0613574
##
## $m2[[3]]$data_trained_on
## [1] "train.wait.3"
##
##
##
## $m3
## $m3[[1]]
## $m3[[1]]$rmse
## [1] 0.08633277
##
## $m3[[1]]$data_trained_on
## [1] "train.wait.1"
##
##
## $m3[[2]]
## $m3[[2]]$rmse
## [1] 0.08598067
##
## $m3[[2]]$data_trained_on
## [1] "train.wait.2"
##
##
## $m3[[3]]
## $m3[[3]]$rmse
## [1] 0.06127696
##
## $m3[[3]]$data_trained_on
## [1] "train.wait.3"
# Calculate and store mean RMSE for each model specification
wait.mean.rmse <- list(
m1 = mean(sapply(wait.rmse.results$m1, function(x) x$rmse)),
m2 = mean(sapply(wait.rmse.results$m2, function(x) x$rmse)),
m3 = mean(sapply(wait.rmse.results$m3, function(x) x$rmse))
)
print(wait.mean.rmse)
## $m1
## [1] 0.08065557
##
## $m2
## [1] 0.07796169
##
## $m3
## [1] 0.07786347
# Baseline forecast
baseline.rmse.wait.1 <- wait.baseline.rmse(train.wait.1, test.wait.1)
baseline.rmse.wait.2 <- wait.baseline.rmse(train.wait.2, test.wait.2)
baseline.rmse.wait.3 <- wait.baseline.rmse(train.wait.3, test.wait.3)
baseline.rmse.wait.1; baseline.rmse.wait.2; baseline.rmse.wait.3
## [1] 0.07517775
## [1] 0.0801787
## [1] 0.08241553
(baseline.rmse.wait.1 + baseline.rmse.wait.2 + baseline.rmse.wait.3)/3
## [1] 0.07925733
# Anova table using F-ratio test
anova(m1.wait.fit[[3]], m2.wait.fit[[3]], m3.wait.fit[[3]],test="F")
## Analysis of Deviance Table
##
## Model 1: cbind(w4hrs, gr4hrs) ~ covid + HBT + s(loc, k = 30, bs = "re") +
## s(julian, k = 80, bs = "ad") + te(dom, month, k = c(30, 12),
## bs = c("cc", "cc"))
## Model 2: cbind(w4hrs, gr4hrs) ~ covid + s(year, k = 8, bs = "cr") + s(loc,
## k = 30, bs = "re") + s(julian, k = 80, bs = "ad") + te(dom,
## month, k = c(30, 12), bs = c("cc", "cc"))
## Model 3: cbind(w4hrs, gr4hrs) ~ covid + loc + s(year, k = 8, bs = "cr") +
## s(julian, k = 80, bs = "ad") + te(dom, month, k = c(30, 12),
## bs = c("cc", "cc"))
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 13060 216034
## 2 13082 216657 -21.85610 -622.58 1.6588 0.02738 *
## 3 13082 216658 -0.12476 -0.61 0.2828 0.19747
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
appraise(m1.wait.fit[[1]])
appraise(m1.wait.fit[[2]])
appraise(m1.wait.fit[[3]])
appraise(m2.wait.fit[[1]])
appraise(m2.wait.fit[[2]])
appraise(m2.wait.fit[[3]])
appraise(m3.wait.fit[[1]])
appraise(m3.wait.fit[[2]])
appraise(m3.wait.fit[[3]])
# Create forecast dataframe
wait.forecast <- test.wait.3[,-c(1,4:12,18:25)]
new_dates <- seq.Date(from = as.Date("2024-05-19"), by = "week", length.out = 24)
new_julian <- seq(3374, 3535, 7)
wait.forecast <- wait.forecast %>%
group_by(loc) %>%
mutate(date = rep(new_dates, length.out = n()),
julian = rep(new_julian, length.out = n())) %>%
ungroup()
wait.forecast$year <- 2024
wait.forecast$month <- month(wait.forecast$date)
wait.forecast$dom <- day(wait.forecast$date)
# Rerun the model with the full dataset to make the forecast
m3.wait.full <- bam(cbind(w4hrs, gr4hrs) ~ covid + loc + s(year, k=8, bs="cr") + s(julian, k=80, bs="ad") + te(dom, month, k=c(30,12), bs=c("cc","cc")),family=quasibinomial, data=wait, knots=knots.wait)
wait.forecasted.values <- cbind(wait.forecast, predict(m3.wait.full,wait.forecast,se=TRUE,type="response"))
summary(m3.wait.full)
##
## Family: quasibinomial
## Link function: logit
##
## Formula:
## cbind(w4hrs, gr4hrs) ~ covid + loc + s(year, k = 8, bs = "cr") +
## s(julian, k = 80, bs = "ad") + te(dom, month, k = c(30, 12),
## bs = c("cc", "cc"))
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9469613 0.0210401 92.536 < 2e-16 ***
## covid 0.0638433 0.0641535 0.995 0.31967
## locA210H -0.2555106 0.0257185 -9.935 < 2e-16 ***
## locB120H -0.0666821 0.0276830 -2.409 0.01602 *
## locC121H 1.4713476 0.0760765 19.340 < 2e-16 ***
## locC313H 0.2822879 0.0296670 9.515 < 2e-16 ***
## locC418H -0.3844698 0.0216505 -17.758 < 2e-16 ***
## locF704H 0.0153675 0.0223041 0.689 0.49084
## locG107H -0.4515662 0.0199947 -22.584 < 2e-16 ***
## locG405H -0.7542473 0.0192873 -39.106 < 2e-16 ***
## locG513H 1.6576712 0.0320931 51.652 < 2e-16 ***
## locH103H 0.8805456 0.0588858 14.953 < 2e-16 ***
## locH202H 0.1374605 0.0271314 5.066 4.1e-07 ***
## locH212H 0.9413024 0.0594708 15.828 < 2e-16 ***
## locL106H -0.0599118 0.0219877 -2.725 0.00644 **
## locL302H -0.4698573 0.0213032 -22.056 < 2e-16 ***
## locL308H -0.4832920 0.0207319 -23.312 < 2e-16 ***
## locN101H -0.4729742 0.0219244 -21.573 < 2e-16 ***
## locN121H 1.4924297 0.0560049 26.648 < 2e-16 ***
## locN411H 0.3618316 0.0316816 11.421 < 2e-16 ***
## locR103H 1.4097837 0.0828073 17.025 < 2e-16 ***
## locS308H 0.0003079 0.0231784 0.013 0.98940
## locS314H -0.6932405 0.0186649 -37.141 < 2e-16 ***
## locS319H 1.4644176 0.0333140 43.958 < 2e-16 ***
## locT101H 1.2196729 0.0306209 39.831 < 2e-16 ***
## locT202H 1.4128056 0.0468019 30.187 < 2e-16 ***
## locV217H -0.5957136 0.0212090 -28.088 < 2e-16 ***
## locW107H 2.2742350 0.1299101 17.506 < 2e-16 ***
## locY144H 0.8567995 0.0492861 17.384 < 2e-16 ***
## locY146H 0.2717760 0.0286812 9.476 < 2e-16 ***
## locZ102H 1.2926061 0.0743009 17.397 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year) 6.24 6.702 6.111 1.41e-05 ***
## s(julian) 61.06 66.964 44.626 < 2e-16 ***
## te(dom,month) 74.67 309.000 1.292 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.895 Deviance explained = 87.5%
## fREML = 41449 Scale est. = 17.261 n = 14460
appraise(m3.wait.full); draw(m3.wait.full, rug=NULL)
# Compute the confidence intervals
wait.forecasted.values <- wait.forecasted.values %>%
mutate(lower_ci = fit - 1.96 * se.fit,
upper_ci = fit + 1.96 * se.fit)
# Bind the forecast with the data
wait.full <- bind_rows(
wait %>% mutate(source = "Actual"),
wait.forecasted.values %>% select(loc, date, fit, lower_ci, upper_ci) %>%
rename(prop.w4hrs = fit) %>%
mutate(source = "Forecast")
)
wait.full.all.loc <- wait.full %>%
filter(date >= as.Date("2023-01-01"))
wait.forecasted.values.plot <- wait.forecasted.values %>%
mutate(source = "Forecast")
wait.full.plot <- bind_rows(
wait.full.all.loc %>% mutate(source = "Actual"),
wait.forecasted.values.plot %>% select(loc, date, fit, lower_ci, upper_ci, source)
)
pew <- ggplot(wait.full.plot, aes(x = date, y = prop.w4hrs, color = loc)) +
geom_line(data = wait.full.plot %>% filter(source == "Actual"), size = 0.3) + # Actual data lines
geom_line(data = wait.full.plot %>% filter(source == "Forecast"), size = 0.3, linetype = "dashed") + # Forecast data lines
geom_ribbon(data = wait.full.plot %>% filter(source == "Forecast"),
aes(x = date, ymin = lower_ci, ymax = upper_ci, fill = loc), alpha = 0.2, inherit.aes = FALSE, show.legend = FALSE) +
labs(
title = "Forecast of the Percentage of Attendances within 4 Hours for All Locations",
x = "Date",
y = "Percentage of Attendances within 4 Hours"
) +
theme_minimal() +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
axis.title.x = element_text(margin = margin(t = 15)),
legend.title = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
legend.position = "none"
) +
scale_x_date(date_labels = "%Y-%m", date_breaks = "6 months") +
facet_wrap(~ loc, scales = "free_y", ncol = 5) + # 5x6 grid
scale_y_continuous(labels = function(x) sprintf("%.1f", x), limits = c(0, 1))
# Print the plot
print(pew)
## Warning: Removed 720 rows containing missing values or values outside the scale range
## (`geom_line()`).
ggsave("wait_forecast_all.png", plot = pew, width = 12, height = 8, dpi = 300)
## Warning: Removed 720 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Plot a selected few hospital locations
wait.full.plot.some <- wait.full.plot %>%
filter(loc %in% c("S314H", "C418H", "T101H", "C121H"))
pew_1 <- ggplot(wait.full.plot.some, aes(x = date, y = prop.w4hrs, color = loc)) +
geom_line(data = wait.full.plot.some %>% filter(source == "Actual"), size = 0.3) + # Actual data lines
geom_line(data = wait.full.plot.some %>% filter(source == "Forecast"), size = 0.3, linetype = "dashed") + # Forecast data lines
geom_ribbon(data = wait.full.plot.some %>% filter(source == "Forecast"),
aes(x = date, ymin = lower_ci, ymax = upper_ci, fill = loc), alpha = 0.2, inherit.aes = FALSE, show.legend = FALSE) +
labs(
title = "Forecast of the Percentage of Attendances within 4 Hours in Selected Hospital Locations",
x = "Date",
y = "Percentage of Attendances within 4 Hours"
) +
theme_minimal() +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
plot.title = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 10, margin = margin(t = 20)),
axis.text = element_text(size = 8),
strip.text = element_text(size = 8),
axis.title.x = element_text(margin = margin(t = 15)),
legend.title = element_blank(),
legend.position = "none"
) +
scale_x_date(date_labels = "%Y-%m", date_breaks = "6 months") +
facet_wrap(~ loc, scales = "free_y", ncol = 2) +
scale_y_continuous(labels = function(x) sprintf("%.1f", x))
# Print the plot
print(pew_1)
## Warning: Removed 96 rows containing missing values or values outside the scale range
## (`geom_line()`).
ggsave("wait_forecast_someloc.png", plot = pew_1, width = 12, height = 8, dpi = 300)
## Warning: Removed 96 rows containing missing values or values outside the scale range
## (`geom_line()`).